In [263]:
import pandas as pd
In [264]:
data = pd.read_csv('stocks.csv')
In [265]:
print(data.head())
Ticker Date Open High Low Close \
0 AAPL 07-02-2023 150.639999 155.229996 150.639999 154.649994
1 AAPL 08-02-2023 153.880005 154.580002 151.169998 151.919998
2 AAPL 09-02-2023 153.779999 154.330002 150.419998 150.869995
3 AAPL 10-02-2023 149.460007 151.339996 149.220001 151.009995
4 AAPL 13-02-2023 150.949997 154.259995 150.919998 153.850006
Adj Close Volume
0 154.414230 83322600
1 151.688400 64120100
2 150.639999 56007100
3 151.009995 57450700
4 153.850006 62199000
In [266]:
data=data.dropna() #Handling missing values
In [436]:
#Correlation Heatmap – Feature Relationships
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.heatmap(data[["Close", "Volume", "Open", "High", "Low"]].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()
In [267]:
data.plot.line(y="Close", use_index=True)
Out[267]:
<Axes: >
In [268]:
data.info() #checking if the delete action is performed, as well as the other column's information
<class 'pandas.core.frame.DataFrame'> RangeIndex: 248 entries, 0 to 247 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ticker 248 non-null object 1 Date 248 non-null object 2 Open 248 non-null float64 3 High 248 non-null float64 4 Low 248 non-null float64 5 Close 248 non-null float64 6 Adj Close 248 non-null float64 7 Volume 248 non-null int64 dtypes: float64(5), int64(1), object(2) memory usage: 15.6+ KB
In [435]:
#Volume vs. Price Movement
fig, ax1 = plt.subplots(figsize=(12,6))
ax1.set_xlabel("Date")
ax1.set_ylabel("Closing Price", color="blue")
ax1.plot(data["Close"], label="Closing Price", color="blue")
ax1.tick_params(axis="y", labelcolor="blue")
ax2 = ax1.twinx()
ax2.set_ylabel("Volume", color="green")
ax2.bar(data.index, data["Volume"], color="green", alpha=0.3)
ax2.tick_params(axis="y", labelcolor="green")
plt.title("Stock Price vs. Trading Volume")
plt.show()
#High volume with price movement = strong trend confirmation.
In [269]:
data["Tomorrow"]= data["Close"].shift(-1) #Creating a new columnn "Tomorrow" which holds the next day's closing stock price
In [270]:
data
Out[270]:
| Ticker | Date | Open | High | Low | Close | Adj Close | Volume | Tomorrow | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | AAPL | 07-02-2023 | 150.639999 | 155.229996 | 150.639999 | 154.649994 | 154.414230 | 83322600 | 151.919998 |
| 1 | AAPL | 08-02-2023 | 153.880005 | 154.580002 | 151.169998 | 151.919998 | 151.688400 | 64120100 | 150.869995 |
| 2 | AAPL | 09-02-2023 | 153.779999 | 154.330002 | 150.419998 | 150.869995 | 150.639999 | 56007100 | 151.009995 |
| 3 | AAPL | 10-02-2023 | 149.460007 | 151.339996 | 149.220001 | 151.009995 | 151.009995 | 57450700 | 153.850006 |
| 4 | AAPL | 13-02-2023 | 150.949997 | 154.259995 | 150.919998 | 153.850006 | 153.850006 | 62199000 | 153.199997 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 243 | GOOG | 01-05-2023 | 107.720001 | 108.680000 | 107.500000 | 107.709999 | 107.709999 | 20926300 | 105.980003 |
| 244 | GOOG | 02-05-2023 | 107.660004 | 107.730003 | 104.500000 | 105.980003 | 105.980003 | 20343100 | 106.120003 |
| 245 | GOOG | 03-05-2023 | 106.220001 | 108.129997 | 105.620003 | 106.120003 | 106.120003 | 17116300 | 105.209999 |
| 246 | GOOG | 04-05-2023 | 106.160004 | 106.300003 | 104.699997 | 105.209999 | 105.209999 | 19780600 | 106.214996 |
| 247 | GOOG | 05-05-2023 | 105.320000 | 106.440002 | 104.738998 | 106.214996 | 106.214996 | 20705300 | NaN |
248 rows × 9 columns
In [271]:
data["Target"]=(data["Tomorrow"]>data["Close"]).astype(int)
In [272]:
data
Out[272]:
| Ticker | Date | Open | High | Low | Close | Adj Close | Volume | Tomorrow | Target | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AAPL | 07-02-2023 | 150.639999 | 155.229996 | 150.639999 | 154.649994 | 154.414230 | 83322600 | 151.919998 | 0 |
| 1 | AAPL | 08-02-2023 | 153.880005 | 154.580002 | 151.169998 | 151.919998 | 151.688400 | 64120100 | 150.869995 | 0 |
| 2 | AAPL | 09-02-2023 | 153.779999 | 154.330002 | 150.419998 | 150.869995 | 150.639999 | 56007100 | 151.009995 | 1 |
| 3 | AAPL | 10-02-2023 | 149.460007 | 151.339996 | 149.220001 | 151.009995 | 151.009995 | 57450700 | 153.850006 | 1 |
| 4 | AAPL | 13-02-2023 | 150.949997 | 154.259995 | 150.919998 | 153.850006 | 153.850006 | 62199000 | 153.199997 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 243 | GOOG | 01-05-2023 | 107.720001 | 108.680000 | 107.500000 | 107.709999 | 107.709999 | 20926300 | 105.980003 | 0 |
| 244 | GOOG | 02-05-2023 | 107.660004 | 107.730003 | 104.500000 | 105.980003 | 105.980003 | 20343100 | 106.120003 | 1 |
| 245 | GOOG | 03-05-2023 | 106.220001 | 108.129997 | 105.620003 | 106.120003 | 106.120003 | 17116300 | 105.209999 | 0 |
| 246 | GOOG | 04-05-2023 | 106.160004 | 106.300003 | 104.699997 | 105.209999 | 105.209999 | 19780600 | 106.214996 | 1 |
| 247 | GOOG | 05-05-2023 | 105.320000 | 106.440002 | 104.738998 | 106.214996 | 106.214996 | 20705300 | NaN | 0 |
248 rows × 10 columns
In [273]:
data.shape
Out[273]:
(248, 10)
In [344]:
#Training an initial ML model
from sklearn.ensemble import RandomForestClassifier #choosing RandomForestClassifier due to it's accuracy, avoids overfitting better than others, and can pick non-linear tendencies in the data.
model= RandomForestClassifier(n_estimators=185, min_samples_split=100, random_state=1, class_weight="balanced") #creating the model
#"n_estimators" are the no' of individual decision trees we want to train - higher they are, higher the accuracy is. "min_samples_split" this will help us to protect from overfitting. "random_state=1" will help us to get same results all the time we run the model, or model's results will be predictible.
train= data.iloc[ :-100]
test= data.iloc[-100: ]
predictors= ["Open","High","Low","Close","Volume"]
model.fit(train[predictors], train["Target"])
Out[344]:
RandomForestClassifier(class_weight='balanced', min_samples_split=100,
n_estimators=185, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(class_weight='balanced', min_samples_split=100,
n_estimators=185, random_state=1)In [345]:
#Measuring the aaccuracy of the model
from sklearn.metrics import precision_score
preds= model.predict(test[predictors]) #preds is the prediction score
In [346]:
preds= pd.Series(preds, index=test.index)
In [347]:
print(set(preds))
{0, 1}
In [348]:
precision_score(test["Target"], preds)
Out[348]:
0.5205479452054794
In [349]:
#plotting the predictions
combined= pd.concat([test["Target"], preds], axis=1)
In [350]:
combined.plot()
Out[350]:
<Axes: >
In [351]:
def predict(train, test, predictors, model):
model.fit(train[predictors], train["Target"])
preds= model.predict(test[predictors])
preds= pd.Series(preds, index=test.index, name="Predictions")
combined=pd.concat([test["Target"], preds], axis=1)
return combined
In [438]:
#Rolling Volatility – Risk Measurement
data["Volatility"] = data["Close"].pct_change().rolling(window=30).std()
plt.figure(figsize=(12, 6))
plt.plot(data["Volatility"], label="30-Day Rolling Volatility", color="purple")
plt.xlabel("Date")
plt.ylabel("Volatility")
plt.title("Stock Rolling Volatility Over Time")
plt.legend()
plt.show()
In [372]:
#Building a backtesting system
def backtest(data1, model, predictors, start=50, step=10): #Testing per year, by taking 10 years of data
all_predictions= [] #Creating a list in which each data frame stores the predicted value, for a year
for i in range(start, data1.shape[0], step):
train= data1.iloc[:i].copy() #Contains all years prior to the current year
test= data1.iloc[i:(i+step)].copy() #Contains the current year
predictions= predict(train, test, predictors, model)
if predictions is not None and not predictions.empty:
all_predictions.append(predictions)
if len(all_predictions) == 0:
raise ValueError("No predictions generated. Check model training and prediction functions.")
return pd.concat(all_predictions,ignore_index=True) #Combines all data frames into a single data frame
In [373]:
predictions= backtest(data, model, predictors)
In [374]:
predictions["Predictions"].value_counts() #counts the no' of predicted times, for each type of value
Out[374]:
Predictions 1 109 0 89 Name: count, dtype: int64
In [375]:
data["Target"].value_counts()
Out[375]:
Target 0 129 1 119 Name: count, dtype: int64
In [376]:
precision_score(predictions["Target"], predictions["Predictions"])
Out[376]:
0.41284403669724773
In [377]:
predictions["Target"].value_counts() / predictions.shape[0]
Out[377]:
Target 0 0.535354 1 0.464646 Name: count, dtype: float64
In [441]:
import plotly.express as px
fig = px.line(data, x=data.index, y="Close", title="Interactive Stock Price Chart")
fig.show()
In [398]:
#Adding additional predictors to our model, to improve accuracy
horizons= [2,5,30,60] #horizons are the rolling means, i.e we will calculate mean of close price in the last 2 days, 5, 30, &60 days, and will find the ratio between today's closing price and closing price in those periods. Performing this to improve predictions
new_predictors= []
data = data.select_dtypes(include=["number"])
for horizon in horizons:
rolling_averages= data.rolling(horizon).mean()
ratio_column= f"Close_Ratio_{horizon}" #creating new column
data[ratio_column]= data["Close"] / rolling_averages["Close"]
trend_column= f"Trend_{horizon}" #This column holds the no' of days in the past X days(i.e days in horizon) that the stock price went up
data[trend_column]= data.shift(1).rolling(horizon).sum()["Target"] #This will calculate the sum of the 1's in target, i.e the trend when stock went up and has predicted corectly
new_predictors+= [ratio_column, trend_column]
In [399]:
data
Out[399]:
| Open | High | Low | Close | Adj Close | Volume | Tomorrow | Target | Close_Ratio_2 | Trend_2 | Close_Ratio_5 | Trend_5 | Close_Ratio_30 | Trend_30 | Close_Ratio_60 | Trend_60 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 150.639999 | 155.229996 | 150.639999 | 154.649994 | 154.414230 | 83322600 | 151.919998 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 153.880005 | 154.580002 | 151.169998 | 151.919998 | 151.688400 | 64120100 | 150.869995 | 0 | 0.991095 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 153.779999 | 154.330002 | 150.419998 | 150.869995 | 150.639999 | 56007100 | 151.009995 | 1 | 0.996532 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 149.460007 | 151.339996 | 149.220001 | 151.009995 | 151.009995 | 57450700 | 153.850006 | 1 | 1.000464 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 150.949997 | 154.259995 | 150.919998 | 153.850006 | 153.850006 | 62199000 | 153.199997 | 0 | 1.009316 | 2.0 | 1.009117 | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 243 | 107.720001 | 108.680000 | 107.500000 | 107.709999 | 107.709999 | 20926300 | 105.980003 | 0 | 0.997638 | 0.0 | 1.009731 | 1.0 | 1.021180 | 13.0 | 1.000531 | 29.0 |
| 244 | 107.660004 | 107.730003 | 104.500000 | 105.980003 | 105.980003 | 20343100 | 106.120003 | 1 | 0.991904 | 0.0 | 0.990967 | 1.0 | 1.003494 | 13.0 | 1.018325 | 28.0 |
| 245 | 106.220001 | 108.129997 | 105.620003 | 106.120003 | 106.120003 | 17116300 | 105.209999 | 0 | 1.000660 | 1.0 | 0.989187 | 2.0 | 1.004731 | 13.0 | 1.056318 | 28.0 |
| 246 | 106.160004 | 106.300003 | 104.699997 | 105.209999 | 105.209999 | 19780600 | 106.214996 | 1 | 0.995694 | 1.0 | 0.986516 | 1.0 | 0.995804 | 13.0 | 1.047752 | 28.0 |
| 247 | 105.320000 | 106.440002 | 104.738998 | 106.214996 | 106.214996 | 20705300 | NaN | 0 | 1.004753 | 1.0 | 0.999699 | 2.0 | 1.005330 | 13.0 | 1.056670 | 29.0 |
248 rows × 16 columns
In [400]:
#Improving the model
model= RandomForestClassifier(n_estimators=100, min_samples_split=50, random_state=1, )
In [401]:
data
Out[401]:
| Open | High | Low | Close | Adj Close | Volume | Tomorrow | Target | Close_Ratio_2 | Trend_2 | Close_Ratio_5 | Trend_5 | Close_Ratio_30 | Trend_30 | Close_Ratio_60 | Trend_60 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 150.639999 | 155.229996 | 150.639999 | 154.649994 | 154.414230 | 83322600 | 151.919998 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 153.880005 | 154.580002 | 151.169998 | 151.919998 | 151.688400 | 64120100 | 150.869995 | 0 | 0.991095 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 153.779999 | 154.330002 | 150.419998 | 150.869995 | 150.639999 | 56007100 | 151.009995 | 1 | 0.996532 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 149.460007 | 151.339996 | 149.220001 | 151.009995 | 151.009995 | 57450700 | 153.850006 | 1 | 1.000464 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 150.949997 | 154.259995 | 150.919998 | 153.850006 | 153.850006 | 62199000 | 153.199997 | 0 | 1.009316 | 2.0 | 1.009117 | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 243 | 107.720001 | 108.680000 | 107.500000 | 107.709999 | 107.709999 | 20926300 | 105.980003 | 0 | 0.997638 | 0.0 | 1.009731 | 1.0 | 1.021180 | 13.0 | 1.000531 | 29.0 |
| 244 | 107.660004 | 107.730003 | 104.500000 | 105.980003 | 105.980003 | 20343100 | 106.120003 | 1 | 0.991904 | 0.0 | 0.990967 | 1.0 | 1.003494 | 13.0 | 1.018325 | 28.0 |
| 245 | 106.220001 | 108.129997 | 105.620003 | 106.120003 | 106.120003 | 17116300 | 105.209999 | 0 | 1.000660 | 1.0 | 0.989187 | 2.0 | 1.004731 | 13.0 | 1.056318 | 28.0 |
| 246 | 106.160004 | 106.300003 | 104.699997 | 105.209999 | 105.209999 | 19780600 | 106.214996 | 1 | 0.995694 | 1.0 | 0.986516 | 1.0 | 0.995804 | 13.0 | 1.047752 | 28.0 |
| 247 | 105.320000 | 106.440002 | 104.738998 | 106.214996 | 106.214996 | 20705300 | NaN | 0 | 1.004753 | 1.0 | 0.999699 | 2.0 | 1.005330 | 13.0 | 1.056670 | 29.0 |
248 rows × 16 columns
In [402]:
import numpy as np
def predict(train, test, predictors, model):
train = train.copy()
test = test.copy()
# Fill missing values with column means
train[predictors] = train[predictors].apply(lambda x: x.fillna(x.mean()))
test[predictors] = test[predictors].apply(lambda x: x.fillna(x.mean()))
model.fit(train[predictors], train["Target"])
preds= model.predict_proba(test[predictors])[:,1] #This returns the probability of the stock price that goes up or down, and selecting the 2nd column which will give the probability that stock price goes up
# Apply custom threshold (0.6)
preds = np.where(preds >= 0.6, 1, 0) # 1 if >= 0.6, else 0
#preds[preds >= .6] = 1 #Setting our custom threshold, by default it is "0.5".so if it greater than .6 there is a chance that price will go up.
#pred[ preds < .6] = 0
return pd.DataFrame({"Predictions": preds}, index=test.index)
#preds= pd.Series(preds, index=tesst.index, name="Predictions")
#combined=pd.concat([test["Target"], preds], axis=1)
#return combined
In [403]:
print("Missing values in train set:\n", train[predictors].isna().sum())
print("Missing values in test set:\n", test[predictors].isna().sum())
Missing values in train set: Open 0 High 0 Low 0 Close 0 Volume 0 dtype: int64 Missing values in test set: Open 0 High 0 Low 0 Close 0 Volume 0 dtype: int64
In [404]:
train.shape
Out[404]:
(148, 10)
In [406]:
test.shape
Out[406]:
(100, 10)
In [412]:
pd.Series(new_predictors).value_counts()
Out[412]:
Close_Ratio_2 1 Trend_2 1 Close_Ratio_5 1 Trend_5 1 Close_Ratio_30 1 Trend_30 1 Close_Ratio_60 1 Trend_60 1 Name: count, dtype: int64
In [405]:
data
Out[405]:
| Open | High | Low | Close | Adj Close | Volume | Tomorrow | Target | Close_Ratio_2 | Trend_2 | Close_Ratio_5 | Trend_5 | Close_Ratio_30 | Trend_30 | Close_Ratio_60 | Trend_60 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 150.639999 | 155.229996 | 150.639999 | 154.649994 | 154.414230 | 83322600 | 151.919998 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 153.880005 | 154.580002 | 151.169998 | 151.919998 | 151.688400 | 64120100 | 150.869995 | 0 | 0.991095 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 153.779999 | 154.330002 | 150.419998 | 150.869995 | 150.639999 | 56007100 | 151.009995 | 1 | 0.996532 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 149.460007 | 151.339996 | 149.220001 | 151.009995 | 151.009995 | 57450700 | 153.850006 | 1 | 1.000464 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 150.949997 | 154.259995 | 150.919998 | 153.850006 | 153.850006 | 62199000 | 153.199997 | 0 | 1.009316 | 2.0 | 1.009117 | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 243 | 107.720001 | 108.680000 | 107.500000 | 107.709999 | 107.709999 | 20926300 | 105.980003 | 0 | 0.997638 | 0.0 | 1.009731 | 1.0 | 1.021180 | 13.0 | 1.000531 | 29.0 |
| 244 | 107.660004 | 107.730003 | 104.500000 | 105.980003 | 105.980003 | 20343100 | 106.120003 | 1 | 0.991904 | 0.0 | 0.990967 | 1.0 | 1.003494 | 13.0 | 1.018325 | 28.0 |
| 245 | 106.220001 | 108.129997 | 105.620003 | 106.120003 | 106.120003 | 17116300 | 105.209999 | 0 | 1.000660 | 1.0 | 0.989187 | 2.0 | 1.004731 | 13.0 | 1.056318 | 28.0 |
| 246 | 106.160004 | 106.300003 | 104.699997 | 105.209999 | 105.209999 | 19780600 | 106.214996 | 1 | 0.995694 | 1.0 | 0.986516 | 1.0 | 0.995804 | 13.0 | 1.047752 | 28.0 |
| 247 | 105.320000 | 106.440002 | 104.738998 | 106.214996 | 106.214996 | 20705300 | NaN | 0 | 1.004753 | 1.0 | 0.999699 | 2.0 | 1.005330 | 13.0 | 1.056670 | 29.0 |
248 rows × 16 columns
In [428]:
predictions["Predictions"].value_counts()
Out[428]:
Predictions 0 149 1 35 Name: count, dtype: int64
In [430]:
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
plt.plot(data["Close"], label="Closing Price", color="blue")
plt.xlabel("Date")
plt.ylabel("Stock Price")
plt.title("Stock Price Trend Over Time")
plt.legend()
plt.show()
In [431]:
data["50_MA"] = data["Close"].rolling(window=50).mean() # 50-day moving average
data["200_MA"] = data["Close"].rolling(window=200).mean()
plt.figure(figsize=(12, 6))
plt.plot(data["Close"], label="Closing Price", color="blue")
plt.plot(data["50_MA"], label="50-Day MA", color="orange")
plt.plot(data["200_MA"], label="200-Day MA", color="red")
plt.xlabel("Date")
plt.ylabel("Stock Price")
plt.title("Stock Price with Moving Averages")
plt.legend()
plt.show()
#Identifies trends, bullish/bearish crossovers, and long-term patterns.
In [ ]: